### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### This file matches MLS and Corelogic data. Note that running this matching algorithm on the sampled and randomized raw data in the replication package yields few matches.


library(data.table)
library(lfe)
library(stargazer)
library(survival)
library(zoo)
library(ggplot2)
library(Hmisc)


# Match CoreLogic records to MLS records that share the same rounded
# coordinates and keep, for each CoreLogic location/date, the MLS record(s)
# closest in time. The merged table is written to disk as a '|'-delimited file.
#
# Arguments (the defaults are the paths this script has always used):
#   mls.path  processed MLS csv; the code reads property_latitude,
#             property_longitude, price, status and record_date_time from it.
#   cor.path  processed CoreLogic csv; the code reads parcellevellatitude,
#             parcellevellongitude and date from it.
#   out.path  destination of the merged file.
#
# Returns the merged data.table invisibly.
match_mls_corelogic <- function(mls.path = '../data/processed/mls/share/combined_processed.csv',
                                cor.path = '../data/processed/corelogic/share/Combined_processed.csv',
                                out.path = '../data/processed/mls/share/corelogic_mls_merged.csv') {

  # processed data in
  mls.in <- fread(mls.path)
  cor.in <- fread(cor.path)

  # The idea is to merge the mls stuff onto the corelogic data + price info + match info from MLS.
  # Merge on long and lat. 4 decimal places, which is precise at ~10 meters
  mls.in <- mls.in[!is.na(property_latitude) & !is.na(property_longitude)]
  cor.in <- cor.in[!is.na(parcellevellatitude) & !is.na(parcellevellongitude)]
  mls.in[, loc.merge := paste(round(property_latitude, 4), round(property_longitude, 4), sep = '_x_')]
  cor.in[, loc.merge := paste(round(parcellevellatitude, 4), round(parcellevellongitude, 4), sep = '_x_')]

  # Only the MLS columns needed downstream are carried into the merge.
  mls.keep <- mls.in[, c('loc.merge', 'price', 'status', 'record_date_time'), with = FALSE]

  # Merge them all, keeping the corelogic as base, and take the closest match.
  # Several records on either side can share a location key, so the join is
  # many-to-many; data.table's argument for permitting that is
  # `allow.cartesian` (the earlier `all.cartesian` spelling is not a merge
  # argument and was ignored).
  merged.all <- merge(cor.in, mls.keep, by = 'loc.merge', all.x = TRUE, allow.cartesian = TRUE)
  merged.all[, cor.date := as.Date(date)]
  merged.all[, mls.date := as.Date(record_date_time)]
  merged.all[, cor.month := as.yearmon(cor.date)]
  merged.all[, mls.month := as.yearmon(mls.date)]
  merged.all[, date.diff := as.numeric(abs(cor.date - mls.date))]

  # Smallest date gap per CoreLogic location/date, ignoring candidates whose
  # gap is missing. A plain min() would return NA as soon as one candidate
  # lacked a date, and the filter below would then discard the dated ones.
  # Groups with no dated candidate at all get NA.
  merged.all[, best.match := if (all(is.na(date.diff))) NA_real_ else min(date.diff, na.rm = TRUE),
             by = c('loc.merge', 'cor.date')]

  # Keep the closest candidate(s) (ties are all kept); rows of a group with no
  # usable gap are kept as-is so every CoreLogic record stays in the output.
  merged.all <- merged.all[is.na(best.match) | (!is.na(date.diff) & date.diff == best.match)]

  # 0/1 indicators for how tightly the two dates agree; rows without a
  # comparable date are coded 0.
  merged.all[, month.match := as.numeric(cor.month == mls.month)]
  merged.all[is.na(month.match), month.match := 0]

  merged.all[, qtr.match := as.numeric(as.yearqtr(cor.date) == as.yearqtr(mls.date))]
  merged.all[is.na(qtr.match), qtr.match := 0]

  merged.all[, day.match := as.numeric(cor.date == mls.date)]
  merged.all[is.na(day.match), day.match := 0]

  # "Same week" here means the two dates are fewer than 7 days apart.
  merged.all[, week.match := as.numeric(date.diff < 7)]
  merged.all[is.na(week.match), week.match := 0]

  write.table(merged.all, out.path, row.names = FALSE, col.names = TRUE, sep = '|')

  invisible(merged.all)
}


# Run the MLS-CoreLogic match when this script is executed or sourced.
match_mls_corelogic()